In [ ]:
# Numerical / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
# Render matplotlib figures inline in the cell output.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Global font sizes for axis labels and tick labels on every figure below.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns
In [ ]:
# Load the dataset; a literal "?" in the file marks a missing value.
data = pd.read_csv("ai4i2020.csv")
data = data.replace("?", np.nan)

# Best-effort numeric conversion: columns that cannot be parsed as floats
# (e.g. the categorical 'Type') are deliberately left unchanged. Only the
# conversion errors are caught so unrelated failures are not hidden.
for column in data.columns:
    try:
        data[column] = data[column].astype(float)
    except (ValueError, TypeError):
        pass

# Identifier columns are not used by any later cell.
data = data.drop(columns=['UDI', 'Product ID'])

# Collapse the five failure-flag columns into one multi-class target
# (0 = none of the flags set). Codes are applied in dict order, so a row with
# several flags set keeps the code of the LAST matching flag, exactly as the
# original sequence of assignments did.
# .loc is used instead of chained indexing (data[col][mask] = v), which may
# write to a temporary copy and is a no-op under pandas Copy-on-Write.
failure_codes = {'TWF': 1, 'HDF': 2, 'PWF': 3, 'OSF': 4, 'RNF': 5}
data['Machine failure'] = 0
for flag_column, code in failure_codes.items():
    data.loc[data[flag_column] == 1, 'Machine failure'] = code
data = data.drop(columns=list(failure_codes))

# Engineered features derived from the raw sensor readings.
data['Power'] = data['Rotational speed [rpm]'] * data['Torque [Nm]']
data['Temperature difference'] = data['Process temperature [K]'] - data['Air temperature [K]']

# Fix the column order: target first, then raw readings, then engineered features.
data = data[[
    'Machine failure',
    'Type',
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Power',
    'Temperature difference'
]]

# Show the prepared frame (pandas truncates the rendering for large frames).
display(data)
| Machine failure | Type | Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | Power | Temperature difference | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | M | 298.1 | 308.6 | 1551.0 | 42.8 | 0.0 | 66382.8 | 10.5 |
| 1 | 0 | L | 298.2 | 308.7 | 1408.0 | 46.3 | 3.0 | 65190.4 | 10.5 |
| 2 | 0 | L | 298.1 | 308.5 | 1498.0 | 49.4 | 5.0 | 74001.2 | 10.4 |
| 3 | 0 | L | 298.2 | 308.6 | 1433.0 | 39.5 | 7.0 | 56603.5 | 10.4 |
| 4 | 0 | L | 298.2 | 308.7 | 1408.0 | 40.0 | 9.0 | 56320.0 | 10.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | 0 | M | 298.8 | 308.4 | 1604.0 | 29.5 | 14.0 | 47318.0 | 9.6 |
| 9996 | 0 | H | 298.9 | 308.4 | 1632.0 | 31.8 | 17.0 | 51897.6 | 9.5 |
| 9997 | 0 | M | 299.0 | 308.6 | 1645.0 | 33.4 | 22.0 | 54943.0 | 9.6 |
| 9998 | 0 | H | 299.0 | 308.7 | 1408.0 | 48.5 | 25.0 | 68288.0 | 9.7 |
| 9999 | 0 | M | 299.0 | 308.7 | 1500.0 | 40.2 | 30.0 | 60300.0 | 9.7 |
10000 rows × 9 columns
In [ ]:
# Preview the first five rows of the prepared frame.
data.head(5)
Out[ ]:
| Machine failure | Type | Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | Power | Temperature difference | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | M | 298.1 | 308.6 | 1551.0 | 42.8 | 0.0 | 66382.8 | 10.5 |
| 1 | 0 | L | 298.2 | 308.7 | 1408.0 | 46.3 | 3.0 | 65190.4 | 10.5 |
| 2 | 0 | L | 298.1 | 308.5 | 1498.0 | 49.4 | 5.0 | 74001.2 | 10.4 |
| 3 | 0 | L | 298.2 | 308.6 | 1433.0 | 39.5 | 7.0 | 56603.5 | 10.4 |
| 4 | 0 | L | 298.2 | 308.7 | 1408.0 | 40.0 | 9.0 | 56320.0 | 10.5 |
In [ ]:
# (rows, columns) of the prepared frame.
data.shape
Out[ ]:
(10000, 9)
In [ ]:
# Build an automated profiling report for the prepared frame.
from ydata_profiling import ProfileReport
profile = ProfileReport(data, title="Pandas Profiling Report")
In [ ]:
# Display the report as the cell output; the recorded output shows the summarize/render progress bars.
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
In [ ]:
# Summary statistics of the numeric columns, one row per column.
data.describe().transpose()
Out[ ]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Machine failure | 10000.0 | 0.09900 | 0.561988 | 0.0 | 0.0 | 0.0 | 0.00 | 5.0 |
| Air temperature [K] | 10000.0 | 300.00493 | 2.000259 | 295.3 | 298.3 | 300.1 | 301.50 | 304.5 |
| Process temperature [K] | 10000.0 | 310.00556 | 1.483734 | 305.7 | 308.8 | 310.1 | 311.10 | 313.8 |
| Rotational speed [rpm] | 10000.0 | 1538.77610 | 179.284096 | 1168.0 | 1423.0 | 1503.0 | 1612.00 | 2886.0 |
| Torque [Nm] | 10000.0 | 39.98691 | 9.968934 | 3.8 | 33.2 | 40.1 | 46.80 | 76.6 |
| Tool wear [min] | 10000.0 | 107.95100 | 63.654147 | 0.0 | 53.0 | 108.0 | 162.00 | 253.0 |
| Power | 10000.0 | 59967.14704 | 10193.093881 | 10966.8 | 53105.4 | 59883.9 | 66873.75 | 99980.4 |
| Temperature difference | 10000.0 | 10.00063 | 1.001094 | 7.6 | 9.3 | 9.8 | 11.00 | 12.1 |
In [ ]:
# Summary (count / unique / top / freq) of the object-typed columns, one row per column.
data.select_dtypes(include='object').describe().transpose()
Out[ ]:
| count | unique | top | freq | |
|---|---|---|---|---|
| Type | 10000 | 3 | L | 6000 |
In [ ]:
# Column names available after preparation; the column lists defined below must match these.
data.columns
Out[ ]:
Index(['Machine failure', 'Type', 'Air temperature [K]',
'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
'Tool wear [min]', 'Power', 'Temperature difference'],
dtype='object')
In [ ]:
# Raw numeric sensor readings used by the distribution and correlation plots.
# The engineered 'Power' and 'Temperature difference' columns are intentionally
# not listed: the subplot grids below have room for at most six panels.
num_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

# Categorical feature columns. 'Failure type' was listed here before, but no
# such column exists in `data` (the failure class lives in 'Machine failure'),
# so selecting data[cat_cols] would have raised a KeyError.
cat_cols = ['Type']

# Multi-class target column.
label = 'Machine failure'
In [ ]:
# Air-temperature values that are missing; an empty Series means there are none.
data.loc[data['Air temperature [K]'].isna(), 'Air temperature [K]']
Out[ ]:
Series([], Name: Air temperature [K], dtype: float64)
In [ ]:
# Histogram with KDE overlay for each raw sensor reading, one panel per column (3x2 grid).
fig = plt.figure(figsize=(12, 12))
for position, col in enumerate(num_cols, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.histplot(data, x=col, kde=True, alpha=0.2, color='red', bins=15, ax=ax)
    ax.set_title(col)
fig.suptitle("Data Distributions", fontsize=15)
fig.tight_layout()
plt.show()
In [ ]:
# Box plot of each raw sensor reading with a rug of the individual observations
# coloured by failure class, one panel per column (2x3 grid).
fig = plt.figure(figsize=(10, 7))
for position, col in enumerate(num_cols, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.rugplot(data, x=col, hue=label, height=0.1, ax=ax)
    sns.boxplot(data, x=col, width=0.25, ax=ax)
fig.suptitle("Data Distributions")
fig.tight_layout()
plt.show()
In [ ]:
# Distribution of each raw sensor reading split by failure class, one panel per column (2x3 grid).
fig = plt.figure(figsize=(10, 7))
for position, col in enumerate(num_cols, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.boxplot(data, x=label, y=col, width=0.5, ax=ax)
fig.suptitle("Data Distribution in Relation to Machine Failure")
fig.tight_layout()
plt.show()
In [ ]:
# Pairwise correlations between the raw sensor readings, annotated to two decimals.
corr_ax = sns.heatmap(data[num_cols].corr(), annot=True, fmt=".2f")
corr_ax.set_title("Heatmap Analysis")
plt.show()
In [ ]:
# Same correlation matrix as a table, for exact values.
data.loc[:, num_cols].corr()
Out[ ]:
| Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | |
|---|---|---|---|---|---|
| Air temperature [K] | 1.000000 | 0.876107 | 0.022670 | -0.013778 | 0.013853 |
| Process temperature [K] | 0.876107 | 1.000000 | 0.019277 | -0.014061 | 0.013488 |
| Rotational speed [rpm] | 0.022670 | 0.019277 | 1.000000 | -0.875027 | 0.000223 |
| Torque [Nm] | -0.013778 | -0.014061 | -0.875027 | 1.000000 | -0.003093 |
| Tool wear [min] | 0.013853 | 0.013488 | 0.000223 | -0.003093 | 1.000000 |
In [ ]:
# Joint density of air vs. process temperature (the pair with correlation 0.88 in the table above).
temperature_ax = data.plot.hexbin(
    x='Air temperature [K]',
    y='Process temperature [K]',
    gridsize=20,
    cmap='Purples',
    figsize=(5, 4),
)
temperature_ax.set_title("Hexbin Plot Between Process Temperature and Air Temperature")
plt.show()
In [ ]:
# Joint density of rotational speed vs. torque (the pair with correlation -0.88 in the table above).
speed_torque_ax = data.plot.hexbin(
    x='Rotational speed [rpm]',
    y='Torque [Nm]',
    gridsize=30,
    cmap='Purples',
    figsize=(5, 4),
)
speed_torque_ax.set_title("Hexbin Plot Between Torque and Rotational speed")
plt.show()
In [ ]:
# Count rows per (Type, failure class) pair; margins=True appends the 'All' totals.
type_machine_failure = data[['Type', 'Machine failure']].pivot_table(
    index='Type',
    columns='Machine failure',
    aggfunc= lambda x: len(x),
    margins = True,
)
print(type_machine_failure)

# Same table as an annotated heatmap (no colour bar; the numbers carry the information).
plt.figure(figsize=(6, 6))
counts_ax = sns.heatmap(
    type_machine_failure,
    annot=True,
    fmt='g',
    cmap='Blues',
    cbar=False,
    linewidths=0.5,
)
counts_ax.set_title("Type vs Machine Failure")
plt.show()
Machine failure 0 1 2 3 4 5 All Type H 979 6 8 4 2 4 1003 L 5757 24 68 51 87 13 6000 M 2916 12 30 28 9 2 2997 All 9652 42 106 83 98 19 10000
In [ ]:
# The star import brings PyCaret's classification functions (setup, compare_models,
# plot_model, save_model, calibrate_model, automl) into the notebook namespace;
# the cells below rely on those names.
from pycaret.classification import *
# Initialise the experiment on the prepared frame with 'Machine failure' as the target.
# NOTE(review): session_id presumably seeds the run (the saved RandomForest reports
# random_state=42) and data_split_stratify presumably stratifies the train/test
# split by target class — confirm against the PyCaret documentation.
s = setup(data, target = 'Machine failure', session_id = 42, data_split_stratify=True)
In [ ]:
# Compare the candidate classifiers with results sorted by AUC and keep the returned model.
best_model = compare_models(sort = 'AUC')
| Initiated | . . . . . . . . . . . . . . . . . . | 12:06:02 |
|---|---|---|
| Status | . . . . . . . . . . . . . . . . . . | Loading Dependencies |
| Estimator | . . . . . . . . . . . . . . . . . . | Compiling Library |
In [ ]:
# Confusion matrix of the selected model.
# NOTE(review): confirm that plot_model draws into the figure opened here; if it
# creates its own figure, this plt.figure call only leaves an empty figure behind.
plt.figure(figsize = (4,3))
plot_model(best_model, plot = 'confusion_matrix')
In [ ]:
# AUC plot of the selected model.
# NOTE(review): confirm that plot_model draws into the figure opened here; if it
# creates its own figure, this plt.figure call only leaves an empty figure behind.
plt.figure(figsize = (5,4))
plot_model(best_model, plot = 'auc')
In [ ]:
# 'learning' plot of the selected model (presumably the learning curve — confirm in the PyCaret docs).
# NOTE(review): confirm that plot_model draws into the figure opened here; if it
# creates its own figure, this plt.figure call only leaves an empty figure behind.
plt.figure(figsize = (5, 4))
plot_model(best_model, plot = 'learning')
In [ ]:
# 'feature' plot of the selected model (presumably feature importance — confirm in the PyCaret docs).
plot_model(best_model, plot = 'feature')
In [ ]:
# Persist the transformation pipeline together with best_model; the recorded output
# shows the file 'ai4i2020_pycaret_model.pkl' being written. The model saved here is
# best_model as selected above, i.e. before the calibrate_model call further down.
save_model(best_model, "ai4i2020_pycaret_model")
Transformation Pipeline and Model Successfully Saved
Out[ ]:
(Pipeline(memory=Memory(location=None),
steps=[('numerical_imputer',
TransformerWrapper(exclude=None,
include=['Air temperature [K]',
'Process temperature [K]',
'Rotational speed [rpm]',
'Torque [Nm]', 'Tool wear [min]',
'Power',
'Temperature difference'],
transformer=SimpleImputer(add_indicator=False,
copy=True,
fill_value=None,
keep_empty_features=False,
missing_values=n...
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
class_weight=None, criterion='gini',
max_depth=None, max_features='sqrt',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
monotonic_cst=None, n_estimators=100,
n_jobs=-1, oob_score=False,
random_state=42, verbose=0,
warm_start=False))],
verbose=False),
'ai4i2020_pycaret_model.pkl')
In [ ]:
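# plot_model(best_model, plot = 'calibration')  # Disabled: PyCaret raises ValueError "Plot Not Available for multiclass problems" for the calibration plot, and this target has six classes.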
In [ ]:
# Build a calibrated version of best_model and keep it under its own name.
# NOTE(review): presumably this is the CalibratedClassifierCV (sigmoid, cv=5) wrapping
# the random forest that appears in the automl() output below — confirm.
calibrated_model = calibrate_model(best_model)
In [ ]:
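# plot_model(calibrated_model, plot = 'calibration')  # Disabled: PyCaret raises ValueError "Plot Not Available for multiclass problems" for the calibration plot, and this target has six classes.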
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[65], line 1 ----> 1 plot_model(calibrated_model, plot = 'calibration') # NOT WORKING File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\utils\generic.py:964, in check_if_global_is_not_none.<locals>.decorator.<locals>.wrapper(*args, **kwargs) 962 if globals_d[name] is None: 963 raise ValueError(message) --> 964 return func(*args, **kwargs) File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\classification\functional.py:1725, in plot_model(estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, verbose, display_format) 1611 @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) 1612 def plot_model( 1613 estimator, (...) 1622 display_format: Optional[str] = None, 1623 ) -> Optional[str]: 1624 """ 1625 This function analyzes the performance of a trained model on holdout set. 1626 It may require re-training the model in certain cases. (...) 1722 1723 """ -> 1725 return _CURRENT_EXPERIMENT.plot_model( 1726 estimator=estimator, 1727 plot=plot, 1728 scale=scale, 1729 save=save, 1730 fold=fold, 1731 fit_kwargs=fit_kwargs, 1732 plot_kwargs=plot_kwargs, 1733 groups=groups, 1734 verbose=verbose, 1735 display_format=display_format, 1736 ) File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\classification\oop.py:2071, in ClassificationExperiment.plot_model(self, estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, verbose, display_format) 1957 def plot_model( 1958 self, 1959 estimator, (...) 1968 display_format: Optional[str] = None, 1969 ) -> Optional[str]: 1970 """ 1971 This function analyzes the performance of a trained model on holdout set. 1972 It may require re-training the model in certain cases. (...) 
2068 2069 """ -> 2071 return super().plot_model( 2072 estimator=estimator, 2073 plot=plot, 2074 scale=scale, 2075 save=save, 2076 fold=fold, 2077 fit_kwargs=fit_kwargs, 2078 plot_kwargs=plot_kwargs, 2079 groups=groups, 2080 verbose=verbose, 2081 display_format=display_format, 2082 ) File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\internal\pycaret_experiment\tabular_experiment.py:2045, in _TabularExperiment.plot_model(self, estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, feature_name, label, verbose, display_format) 1933 def plot_model( 1934 self, 1935 estimator, (...) 1946 display_format: Optional[str] = None, 1947 ) -> Optional[str]: 1948 """ 1949 This function takes a trained model object and returns a plot based on the 1950 test / hold-out set. The process may require the model to be re-trained in (...) 2043 2044 """ -> 2045 return self._plot_model( 2046 estimator=estimator, 2047 plot=plot, 2048 scale=scale, 2049 save=save, 2050 fold=fold, 2051 fit_kwargs=fit_kwargs, 2052 plot_kwargs=plot_kwargs, 2053 groups=groups, 2054 feature_name=feature_name, 2055 label=label, 2056 verbose=verbose, 2057 display_format=display_format, 2058 ) File c:\ProgramData\anaconda3\Lib\site-packages\pycaret\internal\pycaret_experiment\tabular_experiment.py:427, in _TabularExperiment._plot_model(self, estimator, plot, scale, save, fold, fit_kwargs, plot_kwargs, groups, feature_name, label, verbose, system, display, display_format) 425 if self.is_multiclass: 426 if plot in multiclass_not_available: --> 427 raise ValueError( 428 "Plot Not Available for multiclass problems. Please see docstring for list of available Plots." 429 ) 431 # exception for CatBoost 432 # if "CatBoostClassifier" in str(type(estimator)): 433 # raise ValueError( (...) 436 437 # checking for auc plot 438 if not hasattr(estimator, "predict_proba") and plot == "auc": ValueError: Plot Not Available for multiclass problems. Please see docstring for list of available Plots.
In [ ]:
# Ask PyCaret for the model it selects from the current experiment; the recorded
# output is a CalibratedClassifierCV wrapping the random forest.
automl()
Out[ ]:
CalibratedClassifierCV(cv=5, ensemble=True,
estimator=RandomForestClassifier(bootstrap=True,
ccp_alpha=0.0,
class_weight=None,
criterion='gini',
max_depth=None,
max_features='sqrt',
max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
monotonic_cst=None,
n_estimators=100,
n_jobs=-1,
oob_score=False,
random_state=42,
verbose=0,
warm_start=False),
method='sigmoid', n_jobs=None)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CalibratedClassifierCV(cv=5, ensemble=True,
estimator=RandomForestClassifier(bootstrap=True,
ccp_alpha=0.0,
class_weight=None,
criterion='gini',
max_depth=None,
max_features='sqrt',
max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
monotonic_cst=None,
n_estimators=100,
n_jobs=-1,
oob_score=False,
random_state=42,
verbose=0,
warm_start=False),
method='sigmoid', n_jobs=None)RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='sqrt',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
monotonic_cst=None, n_estimators=100, n_jobs=-1,
oob_score=False, random_state=42, verbose=0,
warm_start=False)RandomForestClassifier(n_jobs=-1, random_state=42)
In [ ]:
#create_app(best_model)